Draft for the section general stats

Parameters

Inherited parameter and data from the import tab.

filtered_data <- read.csv("../data/data-2023-09-11 (2).csv", header = TRUE)

selected_stats <- c("Ho", "Hs", "Ht", "Fis (W&C)", "Fst (W&C)", "Fis (Nei)", "Fst (Nei)")

n_rep = 100
n_marker = 6
n_pop = 8

sequence_length <- length(6:11)

data filtering convertion

library(hierfstat)
library(adegenet)
filtered_data <- data.frame(indv = paste(substr(filtered_data$Population, 1, 3),
    row.names(filtered_data), sep = "."), filtered_data)
# Create mydata_genind
population <- filtered_data$Population
mydata_genind <- df2genind(X = filtered_data[, 6:11], sep = "/", ncode = 6, ind.names = filtered_data$indv,
    pop = filtered_data$Population, NA.char = "0/0", ploidy = 2, type = "codom",
    strata = NULL, hierarchy = NULL)
mydata_genind
mydata_hierfstat <- genind2hierfstat(mydata_genind)

Missing data

# Libraries
library("poppr")
## Registered S3 method overwritten by 'pegas':
##   method      from
##   print.amova ade4
## This is poppr version 2.9.4. To get started, type package?poppr
## OMP parallel support: available
library("heatmaply")
## Loading required package: plotly
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: viridis
## Loading required package: viridisLite
## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust vegan
## 
## ======================
## Welcome to heatmaply version 1.5.0
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags: 
##   https://stackoverflow.com/questions/tagged/heatmaply
## ======================
missing_data <- info_table(mydata_genind, plot = FALSE)

# Matrix format
mat <- as.matrix(missing_data)
# heatmap
p <- heatmaply(mat, dendrogram = "none", xlab = "", ylab = "", main = "", scale = "column",
    margins = c(60, 100, 40, 20), grid_color = "white", grid_width = 1e-05, titleX = FALSE,
    hide_colorbar = TRUE, branches_lwd = 0.1, label_names = c("Population", "Marker",
        "Value"), fontsize_row = 8, fontsize_col = 8, labCol = colnames(mat), labRow = rownames(mat),
    heatmap_layers = theme(axis.line = element_blank()))
p

Run basic.stats and render the result

library(pegas)
## Loading required package: ape
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:hierfstat':
## 
##     pcoa, varcomp
## 
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
## 
##     mst
## The following object is masked from 'package:ade4':
## 
##     amova
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:ape':
## 
##     where
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)

result <- basic.stats(mydata_hierfstat)  #hierfstat
df_resutl_basic <- as.data.frame(result$perloc)

# Weir and Cockrham estimates of Fstatistics - FIS and FST
result_f_stats <- Fst(as.loci(mydata_genind), na.alleles = "")  #pegas
result_f_stats <- result_f_stats[, 2:3]
colnames(result_f_stats) <- c("Fst (W&C)", "Fis (W&C)")
result_f_stats <- merge(result_f_stats, df_resutl_basic, by = "row.names", all.x = TRUE)
colnames(result_f_stats)[10] <- "Fst (Nei)"
colnames(result_f_stats)[12] <- "Fis (Nei)"
result_f_stats <- result_f_stats %>%
    column_to_rownames(., var = "Row.names")
result_f_stats_selec <- result_f_stats %>%
    select(all_of(selected_stats))
result_f_stats_selec

G-statistic

library(hrbrthemes)
library(ggplot2)

# compute the Gstats
result_f_stats <- result_f_stats %>%
    mutate(GST = 1 - Hs/Ht)
result_f_stats <- result_f_stats %>%
    mutate(`GST''` = (n_pop * (Ht - Hs))/((n_pop * Hs - Ht) * (1 - Hs)))
result_f_stats
# Plot with linear trend
p2 <- ggplot(result_f_stats, aes(x = GST, y = Hs)) + geom_point() + geom_smooth(method = lm,
    color = "red", se = FALSE) + theme_ipsum()
p2
## `geom_smooth()` using formula = 'y ~ x'

HW - Panmixia

with pegas

library("pegas")
hw.test(as.loci(mydata_genind), B = n_rep)
##         chi^2 df Pr(chi^2 >) Pr.exact
## B12 693.49866 15   0.0000000     0.00
## C07  27.78220 28   0.4760334     0.09
## D12 799.33270 66   0.0000000     0.00
## D10 492.53039 28   0.0000000     0.00
## A12  11.89948 10   0.2918403     0.26
## C03  56.70154 36   0.0153672     0.10

with genepop

library(radiator)
# https://thierrygosselin.github.io/radiator/reference/genomic_converter.html
# https://thierrygosselin.github.io/radiator/articles/rad_genomics_computer_setup.html

# mydata1 <- genomic_converter(mydata_genind, parallel.core =
# parallel::detectCores() - 1, output = 'genepop', filename =
# 'mydata.genepop.txt')

library(genepop)
# https://cran.r-project.org/web/packages/genepop/genepop.pdf genepop_HW <-
# test_HW( inputFile =
# '05_radiator_genomic_converter_20231017@1623/mydata.genepop.txt', which =
# 'Proba', outputFile = '', settingsFile = '', enumeration = FALSE,
# dememorization = 10000, batches = 20, iterations = 5000, verbose =
# interactive() )

Linkage Disequilibrium

I have found different values from Fstats.

with poppr

The index of association was originally developed by A.H.D. Brown analyzing population structure of wild barley (Brown, 1980).

Ia: The index of association. p.Ia: The p-value resulting from a one-sided permutation test based on the number of samples indicated in the original call. rbard: The standardized index of association. p.rd: The p-value resulting from a one-sided permutation test based on the number of samples indicated in the original call.

pair.ia calculates the index of association in a pairwise manner among all loci.

loci_pair <- pair.ia(mydata_genind, sample = n_rep, quiet = FALSE, method = 1,plot = FALSE)
loci_pair
##              Ia    p.Ia   rbarD    p.rD
## B12:C07 -0.0039  0.9901 -0.0039  0.9901
## B12:D12  0.0545  0.8317  0.0547  0.8317
## B12:D10  0.0204  0.2079  0.0204  0.2079
## B12:A12  0.0356  0.0099  0.0356  0.0099
## B12:C03  0.0531  0.1683  0.0534  0.1782
## C07:D12 -0.0233  0.8317 -0.0233  0.8317
## C07:D10  0.0172  0.4059  0.0172  0.4059
## C07:A12  0.0364  0.5941  0.0365  0.5941
## C07:C03  0.1252  0.0693  0.1254  0.0693
## D12:D10  0.0708  0.0099  0.0709  0.0198
## D12:A12  0.0244  0.1584  0.0245  0.1584
## D12:C03 -0.0135  0.6139 -0.0135  0.6139
## D10:A12  0.0170  0.3465  0.0170  0.3465
## D10:C03  0.0456  0.1089  0.0458  0.1089
## A12:C03  0.0284  0.0990  0.0287  0.0990

with genepop

“Genotypes at one locus are independent from genotypes at the other locus”. For a pair of diploid loci, no assumption is made about the gametic phase in double heterozygotes. In particular, it is not inferred assuming one-locus HW equilibrium, as such equilibrium is not assumed anywhere in the formulation of the test. The test is thus one of association between diploid genotypes at both loci, sometimes described as a test of the composite linkage disequilibrium (Bruce S. Weir 1996, 126–28). Contingency tables are created for all pairs of loci in each sample, then a G test or a probability test for each table is computed for each table using the Markov chain algorithm of Raymond and Rousset (1995a). The number of switches of the algorithm is given for each table analyzed.

library(genepop)
outfile <- test_LD(inputFile = "05_radiator_genomic_converter_20231017@1623/mydata.genepop.txt",
    outputFile = "", settingsFile = "", dememorization = 10000, batches = 100, iterations = n_rep,
    verbose = interactive())
readLines(outfile)[136:155]
##  [1] "P-value for each locus pair across all populations"   
##  [2] "(Fisher's method)"                                    
##  [3] "-----------------------------------------------------"
##  [4] "Locus pair                    Chi2      df   P-Value" 
##  [5] "--------------------          --------  ---  --------"
##  [6] "A12           & B12           9.909120  16   0.871331"
##  [7] "A12           & C03           21.451489 16   0.161801"
##  [8] "B12           & C03           21.526163 16   0.159161"
##  [9] "A12           & C07           8.073544  16   0.946647"
## [10] "B12           & C07           15.275568 16   0.504556"
## [11] "C03           & C07           19.524641 16   0.242397"
## [12] "A12           & D10           13.526471 16   0.633945"
## [13] "B12           & D10           8.497647  16   0.932653"
## [14] "C03           & D10           17.076849 16   0.380641"
## [15] "C07           & D10           12.299711 16   0.723103"
## [16] "A12           & D12           12.043688 16   0.740967"
## [17] "B12           & D12           12.438999 16   0.713248"
## [18] "C03           & D12           6.951916  16   0.974177"
## [19] "C07           & D12           28.672296 16   0.026242"
## [20] "D10           & D12           14.840108 16   0.536379"

with pegas

LD2 is based on the observed frequencies of different genotypes (Schaid 2004).

mat_alleles <- filtered_data %>%
    select(Population)
mat_alleles <- cbind(mat_alleles, filtered_data[, 6:11])
mat_alleles_loci <- alleles2loci(mat_alleles, ploidy = 1, population = 1, phased = FALSE)
linkage_pegas <- LD2(mat_alleles_loci)
print(linkage_pegas$T2)
##          T2          df       P-val 
## 69.43451606 48.00000000  0.02313167

DEVELOPMENT

not ready for deployment

panmixia Fstat - shuffle df

#shuffled_matrices <- replicate(n_rep, mat[sample(nrow(mat)), ], simplify = FALSE)
shuffled_matrices <- replicate(n_rep, mat[sample(length(mat), replace = FALSE)], simplify = FALSE)
##################
# shuffle only the genotype and add the pop column later for each matrices. 
#in the loop? 
###############


# Create a list to store the wc
fst_df <- numeric(sequence_length)
fis_df <- numeric(sequence_length)

# Calculate the average for each shuffled matrix

# Iterate through the shuffled matrices
for (i in 1:n_rep) {
  # Calculate the statistics for the i-th matrix
  #HERE THE COLUMN POP
  merged_df <- cbind(level_pop, shuffled_matrices[[i]])
  result_f_stats <- wc(shuffled_matrices[[i]]) 
  result_f_stats <- as.data.frame(result_f_stats$per.loc)
  # Extract FST and FIS values
  fst_values <- result_f_stats$FST
  fis_values <- result_f_stats$FIS
  print( fst_values)
  # Assign values to the data frames
  fst_df <- cbind(fst_df, result_f_stats$FST)
  fis_df <- cbind(fis_df, result_f_stats$FIS)
}

# Set row names as in result_f_stats

rownames(fst_df) <- rownames(fis_df) <- rownames(result_f_stats)
result_FST <- fst_df[, -1]
fis_df <-fis_df[, -1]
vec <- seq(1, n_rep)
colnames(result_FST) <- colnames(fis_df) <- vec


result_FST[1,] 

count (result_f_stats[,1][1] > result_FST[1,] )
count <- sum(result_f_stats[,1][1] > result_FST[1, ])



# Initialize an empty data frame to store the counts
count_df <- data.frame(
  Greater = numeric(length(result_FST)),
  Smaller = numeric(length(result_FST))
)

# Compare the values in result_f_stats[1] to result_FST for each column
for (col in colnames(result_FST)) {
  greater_count <- sum(result_f_stats[1] > result_FST[, col])
  smaller_count <- sum(result_f_stats[1] < result_FST[, col])
  count_df$Greater[col] <- greater_count
  count_df$Smaller[col] <- smaller_count
}

# Print the count data frame
print(count_df)

######################## ######################## 

File export

library(radiator)

genomic_converter(
  data,
  strata = NULL,
  output = NULL,
  filename = NULL,
  parallel.core = parallel::detectCores() - 1,
  verbose = TRUE,
  ...
)